# Copyright (c) 2020 Jacobo Pardo Seco
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

x<-x[order(x[,"date"],decreasing=F),]
var<-unique(unlist(strsplit(x[,"Last.Version.Variants"],split=",")))
vm<-matrix(0,nr=nrow(x),nc=length(var));for(i in 1:length(var)){vm[grep(var[i],x[,"Last.Version.Variants"]),i]<-1};colnames(vm)<-var
u<-sort(unique(x[,"region_exposure"]));sq_div<-as.numeric()
#######################################################################################################################################
# Sequence diversity index
##############################################################################################################################
u<-sort(unique(x[,"region_exposure"]))
out1<-list()
for(reg in 1:length(u)){
	w_reg<-which(x[,"region_exposure"]==u[reg])
	date<-unique(x[w_reg,"date"])
	sq_div<-as.numeric()
	for(day in 1:length(date)){
		w<-which(x[w_reg,"date"]%in%date[1:day])
		if(length(w)>1){t0<-apply(vm[w_reg[w],],1,paste,collapse="")}else{t0<-paste(vm[w_reg[w],],collapse="")};t<-table(t0);freq<-prop.table(t)
		if(sum(t)>9){sq_div<-c(sq_div,(1-sum(freq^2))*sum(t)/(sum(t)-1))}else{sq_div<-c(sq_div,NA)}
	}
	out1[[reg]]<-cbind(sq_div,as.character(date))
}

date<-unique(x[,"date"])
sq_div<-as.numeric()
for(day in 1:length(date)){
	w<-which(x[,"date"]%in%date[1:day])
	if(length(w)>1){t0<-apply(vm[w,],1,paste,collapse="")}else{t0<-paste(vm[w,],collapse="")};t<-table(t0);freq<-prop.table(t)
	if(sum(t)>9){sq_div<-c(sq_div,(1-sum(freq^2))*sum(t)/(sum(t)-1))}else{sq_div<-c(sq_div,NA)}
}
out1[[length(out1)+1]]<-cbind(sq_div,as.character(date))


#######################################################################################################################################
# Nucleotide diversity index
##############################################################################################################################

out2<-list()
for(reg in 1:length(u)){
	x0<-x[which(x[,"region_exposure"]==u[reg]),]
	var<-unique(unlist(strsplit(x0[,"Last.Version.Variants"],split=",")))
	vm<-matrix(0,nr=nrow(x0),nc=length(var));for(i in 1:length(var)){vm[grep(var[i],x0[,"Last.Version.Variants"]),i]<-1};colnames(vm)<-var
	date<-unique(x0[,"date"])
	s_total<-as.numeric()
	for(day in 1:length(date)){
		w<-which(x0[,"date"]%in%date[1:day])
		if(length(w)>1){
			t0<-apply(vm[w,],1,paste,collapse="");t<-table(t0)
			tm<-matrix(unlist(strsplit(names(t),split="")),nr=length(t),byrow=T)
			if(nrow(tm)>1){
				dif<-matrix(nr=length(t),nc=length(t));for (i in 1:(nrow(tm)-1)){for( k in (i+1):nrow(tm)){dif[i,k]<-dif[k,i]<-sum(tm[i,]!=tm[k,])/(29776-169)}}
				rownames(dif)<-colnames(dif)<-names(t)
				freq<-prop.table(t);p<-outer(freq,freq,"*")
				s<-0
				for(i in 1:(length(t)-1)){
					for(j in (i+1):length(t)){
						s<-s+p[i,j]*dif[which(rownames(dif)==names(t)[i]),which(rownames(dif)==names(t)[j])]
					}
				}
				s_total<-c(s_total,2*s*length(t)/(length(t)-1))
			}else{
				s_total<-c(s_total,NA)
			}
		}else{
			s_total<-c(s_total,NA)
		}
	}
	out2[[reg]]<-cbind(s_total,as.character(date))
}

x0<-x
var<-unique(unlist(strsplit(x0[,"Last.Version.Variants"],split=",")))
vm<-matrix(0,nr=nrow(x0),nc=length(var));for(i in 1:length(var)){vm[grep(var[i],x0[,"Last.Version.Variants"]),i]<-1};colnames(vm)<-var
date<-unique(x0[,"date"])
s_total<-as.numeric()
for(day in 1:length(date)){
	w<-which(x0[,"date"]%in%date[1:day])
	if(length(w)>1){
		t0<-apply(vm[w,],1,paste,collapse="");t<-table(t0)
		tm<-matrix(unlist(strsplit(names(t),split="")),nr=length(t),byrow=T)
		if(nrow(tm)>1){
			dif<-matrix(nr=length(t),nc=length(t));for (i in 1:(nrow(tm)-1)){for( k in (i+1):nrow(tm)){dif[i,k]<-dif[k,i]<-sum(tm[i,]!=tm[k,])/29775}}
			rownames(dif)<-colnames(dif)<-names(t)
			freq<-prop.table(t);p<-outer(freq,freq,"*")
			s<-0
			for(i in 1:(length(t)-1)){
				for(j in (i+1):length(t)){
					s<-s+p[i,j]*dif[which(rownames(dif)==names(t)[i]),which(rownames(dif)==names(t)[j])]
				}
			}
			s_total<-c(s_total,2*s*length(t)/(length(t)-1))
		}else{
			s_total<-c(s_total,NA)
		}
	}else{
		s_total<-c(s_total,NA)
	}
}
out2[[length(out2)+1]]<-cbind(s_total,as.character(date))
#######################################################################################################################################
#Tajima D
##############################################################################################################################
out3<-list()
for(reg in 1:length(u)){
	x0<-x[which(x[,"region_exposure"]==u[reg]),]
	var<-unique(unlist(strsplit(x0[,"Last.Version.Variants"],split=",")))
	vm<-matrix(0,nr=nrow(x0),nc=length(var));for(i in 1:length(var)){vm[grep(var[i],x0[,"Last.Version.Variants"]),i]<-1};colnames(vm)<-var
	# Cálculo de las diferencias por pares. Solo llenamos la parte triangular superior
	dif<-matrix(0,nr=nrow(vm),nc=nrow(vm));for(i in 1:(nrow(vm)-1)){for(j in (i+1):nrow(vm)){dif[i,j]<-sum(vm[i,]!=vm[j,])}}
	date<-unique(x0[,"date"])
	D<-nv<-as.numeric()
	for(day in 1:length(date)){
		w<-which(x0[,"date"]%in%date[1:day]);len<-length(w);nv<-c(nv,len)
		if(len>1){
			S<-sum(apply(vm[w,],2,function(z){length(unique(z))>1})==T)
			pi<-sum(dif[w,w])/choose(len,2)
			a1<-sum(1/(1:(length(w)-1)));a2<-sum(1/((1:(length(w)-1))^2))
			b1<-(len+1)/(3*(len-1));b2<-2*(len^2+len+3)/(9*len*(len-1))
			c1<-b1-1/a1;c2<-b2-((len+2)/(a1*len))+a2/(a1^2)
			e1<-c1/a1;e2<-c2/(a1^2+a2)
			D<-c(D,(pi-S/a1)/sqrt(e1*S+e2*S*(S-1)))
		}else{
			D<-c(D,NA)
		}
	}
	out3[[reg]]<-cbind(D,as.character(date),nv)
}
x0<-x
var<-unique(unlist(strsplit(x0[,"Last.Version.Variants"],split=",")))
vm<-matrix(0,nr=nrow(x0),nc=length(var));for(i in 1:length(var)){vm[grep(var[i],x0[,"Last.Version.Variants"]),i]<-1};colnames(vm)<-var
# Cálculo de las diferencias por pares. Solo llenamos la parte triangular superior
dif<-matrix(0,nr=nrow(vm),nc=nrow(vm));for(i in 1:(nrow(vm)-1)){for(j in (i+1):nrow(vm)){dif[i,j]<-sum(vm[i,]!=vm[j,])}}
date<-unique(x0[,"date"])
D<-nv<-as.numeric()
for(day in 1:length(date)){
	w<-which(x0[,"date"]%in%date[1:day]);len<-length(w);;nv<-c(nv,len)
	if(len>1){
		S<-sum(apply(vm[w,],2,function(z){length(unique(z))>1})==T)
		pi<-sum(dif[w,w])/choose(len,2)
		a1<-sum(1/(1:(length(w)-1)));a2<-sum(1/((1:(length(w)-1))^2))
		b1<-(len+1)/(3*(len-1));b2<-2*(len^2+len+3)/(9*len*(len-1))
		c1<-b1-1/a1;c2<-b2-((len+2)/(a1*len))+a2/(a1^2)
		e1<-c1/a1;e2<-c2/(a1^2+a2)
		D<-c(D,(pi-S/a1)/sqrt(e1*S+e2*S*(S-1)))
	}else{
		D<-c(D,NA)
	}
}
out3[[length(out3)+1]]<-cbind(D,as.character(date),nv)
names(out1)<-names(out2)<-names(out3)<-c(u,"Total")
